Global Deaths Due to Air Pollution

Elizabeth Bekele, Alison Cheek

2022-05-03

Introduction

Packages Required

#This will allow us to filter through our data 
library(tidyverse)
library(dplyr)
#This will help us plot figures to showcase our findings
library(ggplot2)
#This will help us organize and display our data as necessary 
library(knitr)
library(kableExtra)
#This expands our plot uses 
library(plotly)

Pollution Data

Import the deaths-due-to-air-pollution data

# Load the air-pollution death-rate data. read.csv() already returns a
# data frame, so no extra data.frame() wrapper is needed.
deaths_df <- read.csv("death-rates-from-air-pollution.csv")

We are going to rename a few of the columns and glimpse the data

# Replace the CSV's original headers with short snake_case names. The
# assignment is positional, so the order here must match the file's columns.
names(deaths_df) <- c(
  "country", "acronym", "year", "total_deaths",
  "indoor_deaths", "outdoor_deaths", "ozone_deaths"
)

# Quick structural overview: column names, types and first values
glimpse(deaths_df)
## Rows: 6,468
## Columns: 7
## $ country        <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanist…
## $ acronym        <chr> "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG",…
## $ year           <int> 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1…
## $ total_deaths   <dbl> 299.4773, 291.2780, 278.9631, 278.7908, 287.1629, 288.0…
## $ indoor_deaths  <dbl> 250.3629, 242.5751, 232.0439, 231.6481, 238.8372, 239.9…
## $ outdoor_deaths <dbl> 46.44659, 46.03384, 44.24377, 44.44015, 45.59433, 45.36…
## $ ozone_deaths   <dbl> 5.616442, 5.603960, 5.611822, 5.655266, 5.718922, 5.739…

Data Variables

Variables that interest us here include:

World Population Data

Now, let’s take a look at the population data.

# Load the long-format world population table and inspect its structure
world_pop <- read.csv(file = "population_total_long.csv")
glimpse(world_pop)
## Rows: 12,595
## Columns: 3
## $ Country.Name <chr> "Aruba", "Afghanistan", "Angola", "Albania", "Andorra", "…
## $ Year         <int> 1960, 1960, 1960, 1960, 1960, 1960, 1960, 1960, 1960, 196…
## $ Count        <int> 54211, 8996973, 5454933, 1608800, 13411, 92418, 20481779,…

To get a general idea of the deaths data frame we made, let’s make a plot to see what’s happening. This plot shows indoor versus outdoor deaths around the world, by country.

# Scatter plot of every country-year: indoor vs. outdoor deaths.
# The `text` aesthetic is not drawn by geom_point(); it carries a
# "country, year" label along so ggplotly() can use it in the hover tooltip.
d <- deaths_df %>%
  ggplot(aes(
    x = indoor_deaths,
    y = outdoor_deaths,
    text = paste0(country, ", ", year)
  )) +
  geom_point() +
  labs(title = "Outdoor Deaths vs Indoor Deaths")
ggplotly(d)

This is a mess, and so we chose two countries from each continent (a high-population and a low-population country) to graph.

Combine Data Sets

First let’s look at a table of the high and low populated countries using the world population data set.

# Population rows after 1996 for the six high-population countries,
# grouped by year
high_pop_countries <- world_pop %>%
  filter(
    Year > 1996,
    Country.Name %in% c(
      "United States", "Brazil", "Nigeria",
      "Germany", "Pakistan", "Australia"
    )
  ) %>%
  group_by(Year)

head(high_pop_countries)
## # A tibble: 6 × 3
## # Groups:   Year [1]
##   Country.Name   Year     Count
##   <chr>         <int>     <int>
## 1 Australia      1997  18517000
## 2 Brazil         1997 167209040
## 3 Germany        1997  82034771
## 4 Nigeria        1997 113457663
## 5 Pakistan       1997 131057431
## 6 United States  1997 272657000
# Population rows after 1996 for the six low-population countries,
# grouped by year
low_pop_countries <- world_pop %>%
  filter(
    Year > 1996,
    Country.Name %in% c(
      "Canada", "Chile", "Malawi",
      "Serbia", "Sri Lanka", "New Zealand"
    )
  ) %>%
  group_by(Year)

head(low_pop_countries)
## # A tibble: 6 × 3
## # Groups:   Year [1]
##   Country.Name  Year    Count
##   <chr>        <int>    <int>
## 1 Canada        1997 29905948
## 2 Chile         1997 14786220
## 3 Sri Lanka     1997 18470900
## 4 Malawi        1997 10264906
## 5 New Zealand   1997  3781300
## 6 Serbia        1997  7596501

Next, we are going to see the death count for high and low populated countries using the deaths dataframe.

# Death-rate rows after 1996 for the six high-population countries,
# grouped by year
high_pop_death <- deaths_df %>%
  filter(
    year > 1996,
    country %in% c(
      "United States", "Brazil", "Nigeria",
      "Germany", "Pakistan", "Australia"
    )
  ) %>%
  group_by(year)

head(high_pop_death)
## # A tibble: 6 × 7
## # Groups:   year [6]
##   country   acronym  year total_deaths indoor_deaths outdoor_deaths ozone_deaths
##   <chr>     <chr>   <int>        <dbl>         <dbl>          <dbl>        <dbl>
## 1 Australia AUS      1997         22.4         0.322           21.8        0.314
## 2 Australia AUS      1998         21.5         0.284           21.0        0.305
## 3 Australia AUS      1999         20.4         0.259           19.9        0.295
## 4 Australia AUS      2000         19.4         0.240           18.9        0.290
## 5 Australia AUS      2001         18.6         0.223           18.1        0.284
## 6 Australia AUS      2002         18.1         0.211           17.7        0.286
# Death-rate rows after 1996 for the six low-population countries,
# grouped by year
low_pop_death <- deaths_df %>%
  filter(
    year > 1996,
    country %in% c(
      "Canada", "Chile", "Malawi",
      "Serbia", "Sri Lanka", "New Zealand"
    )
  ) %>%
  group_by(year)

head(low_pop_death)
## # A tibble: 6 × 7
## # Groups:   year [6]
##   country acronym  year total_deaths indoor_deaths outdoor_deaths ozone_deaths
##   <chr>   <chr>   <int>        <dbl>         <dbl>          <dbl>        <dbl>
## 1 Canada  CAN      1997         21.9        0.0878           19.9         2.20
## 2 Canada  CAN      1998         21.7        0.0824           19.6         2.21
## 3 Canada  CAN      1999         21.2        0.0751           19.2         2.19
## 4 Canada  CAN      2000         20.3        0.0682           18.3         2.13
## 5 Canada  CAN      2001         19.8        0.0641           17.9         2.08
## 6 Canada  CAN      2002         19.5        0.0605           17.7         2.05

Lastly, we will join the population and deaths data for each respective country.

# Combine high-population death rates with population counts, matching on
# country and year. A right join keeps every row of the population table.
joined_high <- high_pop_death %>%
  right_join(
    high_pop_countries,
    by = c("country" = "Country.Name", "year" = "Year")
  )
head(joined_high)
## # A tibble: 6 × 8
## # Groups:   year [6]
##   country   acronym  year total_deaths indoor_deaths outdoor_deaths ozone_deaths
##   <chr>     <chr>   <int>        <dbl>         <dbl>          <dbl>        <dbl>
## 1 Australia AUS      1997         22.4         0.322           21.8        0.314
## 2 Australia AUS      1998         21.5         0.284           21.0        0.305
## 3 Australia AUS      1999         20.4         0.259           19.9        0.295
## 4 Australia AUS      2000         19.4         0.240           18.9        0.290
## 5 Australia AUS      2001         18.6         0.223           18.1        0.284
## 6 Australia AUS      2002         18.1         0.211           17.7        0.286
## # … with 1 more variable: Count <int>
# Combine low-population death rates with population counts, matching on
# country and year. A right join keeps every row of the population table.
joined_low <- low_pop_death %>%
  right_join(
    low_pop_countries,
    by = c("country" = "Country.Name", "year" = "Year")
  )
head(joined_low)
## # A tibble: 6 × 8
## # Groups:   year [6]
##   country acronym  year total_deaths indoor_deaths outdoor_deaths ozone_deaths
##   <chr>   <chr>   <int>        <dbl>         <dbl>          <dbl>        <dbl>
## 1 Canada  CAN      1997         21.9        0.0878           19.9         2.20
## 2 Canada  CAN      1998         21.7        0.0824           19.6         2.21
## 3 Canada  CAN      1999         21.2        0.0751           19.2         2.19
## 4 Canada  CAN      2000         20.3        0.0682           18.3         2.13
## 5 Canada  CAN      2001         19.8        0.0641           17.9         2.08
## 6 Canada  CAN      2002         19.5        0.0605           17.7         2.05
## # … with 1 more variable: Count <int>

Death Count

Which country has the highest death count?

Let’s make a table depicting the high- and low-population countries and their respective death counts due to pollution.

## Adding missing grouping variables: `country`
## Adding missing grouping variables: `country`
country average_death_high
Australia 17.76815
Brazil 48.42928
Germany 28.10988
Nigeria 112.30157
Pakistan 144.33463
United States 26.35827
country average_death_low
Canada 18.18542
Chile 36.51321
Malawi 147.77167
New Zealand 15.92536
Serbia 80.66558
Sri Lanka 69.60383

Here’s a graph to clearly visualize the previous table

Which type of pollution has the greatest number of deaths?

# High Population Pollutant Averages
# Per-country mean of each pollutant's deaths over all years in deaths_df
# (no year filter is applied here).
high_poll <- deaths_df %>%
  filter(country %in% c(
    "United States", "Brazil", "Nigeria",
    "Germany", "Pakistan", "Australia"
  )) %>%
  group_by(country) %>%
  summarise(
    avg_indoor = mean(indoor_deaths),
    avg_outdoor = mean(outdoor_deaths),
    avg_ozone = mean(ozone_deaths)
  )

high_poll
## # A tibble: 6 × 4
##   country       avg_indoor avg_outdoor avg_ozone
##   <chr>              <dbl>       <dbl>     <dbl>
## 1 Australia          0.249        17.2     0.360
## 2 Brazil            19.4          26.8     2.74 
## 3 Germany            0.717        25.5     2.34 
## 4 Nigeria           75.9          35.2     2.12 
## 5 Pakistan          87.7          50.5    10.4  
## 6 United States      0.166        22.8     3.92
# Plot High Population
# Indoor air pollution: one point per country
h_indoor <- high_poll %>%
  ggplot(aes(x = country, y = avg_indoor)) +
  geom_point() +
  labs(title = "Average Death due to Indoor Air Pollution for High Population")
ggplotly(h_indoor)
# Outdoor air pollution: points are coloured by the average ozone deaths
h_outdoor <- high_poll %>%
  ggplot(aes(x = country, y = avg_outdoor, color = avg_ozone)) +
  geom_point() +
  labs(
    title = "Average Death due to Outdoor Air Pollution for High Population",
    x = "Country",
    y = "Average Outdoor Air Pollution Deaths"
  )
ggplotly(h_outdoor)
# Low Population Pollutant Averages
# Per-country mean of each pollutant's deaths over all years in deaths_df
# (no year filter is applied here).
low_poll <- deaths_df %>%
  filter(country %in% c(
    "Canada", "Chile", "Malawi",
    "Serbia", "Sri Lanka", "New Zealand"
  )) %>%
  group_by(country) %>%
  summarise(
    avg_indoor = mean(indoor_deaths),
    avg_outdoor = mean(outdoor_deaths),
    avg_ozone = mean(ozone_deaths)
  )
low_poll
## # A tibble: 6 × 4
##   country     avg_indoor avg_outdoor avg_ozone
##   <chr>            <dbl>       <dbl>     <dbl>
## 1 Canada          0.0651        16.4    1.97  
## 2 Chile           8.69          27.2    0.850 
## 3 Malawi        132.            13.8    3.39  
## 4 New Zealand     0.291         15.6    0.0728
## 5 Serbia         35.9           42.7    2.94  
## 6 Sri Lanka      44.5           24.8    0.430
# Plot Low Population
# Indoor air pollution: one point per country
l_indoor <- low_poll %>%
  ggplot(aes(x = country, y = avg_indoor)) +
  geom_point() +
  labs(title = "Average Death due to Indoor Air Pollution for Low Population")
ggplotly(l_indoor)
# Outdoor air pollution: points are coloured by the average ozone deaths
l_outdoor <- low_poll %>%
  ggplot(aes(x = country, y = avg_outdoor, color = avg_ozone)) +
  geom_point() +
  labs(
    title = "Average Death due to Outdoor Air Pollution for Low Population",
    subtitle = "Colored by Average Ozone Air Pollution Deaths"
  )
ggplotly(l_outdoor)

Pollution Over Time

Which year had the worst pollution?

Which year had the worst indoor? Outdoor particulate? Outdoor ozone?

Which is worse - outdoor or indoor pollution?

First, we split the data into high and low population based on country

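A low-population country has roughly one tenth of the population of its high-population counterpart (low population ≈ high population × 0.10).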

# Population rows after 1996 for the six high-population countries,
# grouped by year; printed in full this time
high_pop_countries <- world_pop %>%
  filter(
    Year > 1996,
    Country.Name %in% c(
      "United States", "Brazil", "Nigeria",
      "Germany", "Pakistan", "Australia"
    )
  ) %>%
  group_by(Year)

high_pop_countries
## # A tibble: 126 × 3
## # Groups:   Year [21]
##    Country.Name   Year     Count
##    <chr>         <int>     <int>
##  1 Australia      1997  18517000
##  2 Brazil         1997 167209040
##  3 Germany        1997  82034771
##  4 Nigeria        1997 113457663
##  5 Pakistan       1997 131057431
##  6 United States  1997 272657000
##  7 Australia      1998  18711000
##  8 Brazil         1998 169785250
##  9 Germany        1998  82047195
## 10 Nigeria        1998 116319759
## # … with 116 more rows
# Population rows after 1996 for the six low-population countries,
# grouped by year; printed in full this time
low_pop_countries <- world_pop %>%
  filter(
    Year > 1996,
    Country.Name %in% c(
      "Canada", "Chile", "Malawi",
      "Serbia", "Sri Lanka", "New Zealand"
    )
  ) %>%
  group_by(Year)

low_pop_countries
## # A tibble: 126 × 3
## # Groups:   Year [21]
##    Country.Name  Year    Count
##    <chr>        <int>    <int>
##  1 Canada        1997 29905948
##  2 Chile         1997 14786220
##  3 Sri Lanka     1997 18470900
##  4 Malawi        1997 10264906
##  5 New Zealand   1997  3781300
##  6 Serbia        1997  7596501
##  7 Canada        1998 30155173
##  8 Chile         1998 14977733
##  9 Sri Lanka     1998 18564599
## 10 Malawi        1998 10552338
## # … with 116 more rows
# Mean total deaths per country for the high-population countries.
# No year filter is applied, so the mean covers every year present in
# deaths_df (the data start in 1990), not only the years after 1996.
# summarize() only needs the grouping column and total_deaths, so the
# intermediate select() -- which merely triggered an "Adding missing grouping
# variables" message -- has been dropped; the result is unchanged.
deaths_highpop_countries <- deaths_df %>%
  filter(country %in% c(
    "United States", "Brazil", "Nigeria",
    "Germany", "Pakistan", "Australia"
  )) %>%
  group_by(country) %>%
  summarize(average_death_high = mean(total_deaths))
# Mean total deaths per country for the low-population countries.
# No year filter is applied, so the mean covers every year present in
# deaths_df (the data start in 1990).
# summarize() only needs the grouping column and total_deaths, so the
# intermediate select() -- which merely triggered an "Adding missing grouping
# variables" message -- has been dropped; the result is unchanged.
deaths_lowpop_countries <- deaths_df %>%
  filter(country %in% c(
    "Canada", "Chile", "Malawi",
    "Serbia", "Sri Lanka", "New Zealand"
  )) %>%
  group_by(country) %>%
  summarize(average_death_low = mean(total_deaths))
# Render the high- and low-population death averages with a single kable() call
list(deaths_highpop_countries, deaths_lowpop_countries) %>%
  kable()
country average_death_high
Australia 17.76815
Brazil 48.42928
Germany 28.10988
Nigeria 112.30157
Pakistan 144.33463
United States 26.35827
country average_death_low
Canada 18.18542
Chile 36.51321
Malawi 147.77167
New Zealand 15.92536
Serbia 80.66558
Sri Lanka 69.60383
# Horizontal bar chart: average total deaths, high-population countries.
# coord_flip() puts the countries on the vertical axis.
deaths_highpop_countries %>%
  ggplot(aes(x = country, y = average_death_high)) +
  geom_col() +
  labs(
    x = "Country",
    y = "Average deaths (per 100,000)",
    title = "Average total deaths in high-population countries"
  ) +
  coord_flip()

# Horizontal bar chart: average total deaths, low-population countries
deaths_lowpop_countries %>%
  ggplot(aes(x = country, y = average_death_low)) +
  geom_col() +
  labs(
    x = "Country",
    y = "Average deaths (per 100,000)",
    title = "Average total deaths in low-population countries"
  ) +
  coord_flip()

This shows us the deaths due to pollution, but what about the average population of those countries at that time?

# Mean population per high-population country over the years after 1996.
# summarize() only needs the grouping column and Count, so the intermediate
# select() -- which merely triggered an "Adding missing grouping variables"
# message -- has been dropped; the result is unchanged.
hp_countries_population <- world_pop %>%
  filter(
    Country.Name %in% c(
      "United States", "Brazil", "Nigeria",
      "Germany", "Pakistan", "Australia"
    ),
    Year > 1996
  ) %>%
  group_by(Country.Name) %>%
  summarize(average_population = mean(Count))
#hp_countries_population

# Mean population per low-population country over the years after 1996.
# summarize() only needs the grouping column and Count, so the intermediate
# select() -- which merely triggered an "Adding missing grouping variables"
# message -- has been dropped; the result is unchanged.
lp_countries_population <- world_pop %>%
  filter(
    Country.Name %in% c(
      "Canada", "Chile", "Malawi",
      "Serbia", "Sri Lanka", "New Zealand"
    ),
    Year > 1996
  ) %>%
  group_by(Country.Name) %>%
  summarize(average_population = mean(Count))
# Population Average Table: both summaries rendered with a single kable() call
list(hp_countries_population, lp_countries_population) %>%
  kable()
Country.Name average_population
Australia 21217772
Brazil 189132292
Germany 81914540
Nigeria 148549958
Pakistan 168525322
United States 300447600
Country.Name average_population
Canada 33029774
Chile 16555805
Malawi 13605376
New Zealand 4214995
Serbia 7345882
Sri Lanka 19824652
# Graph of Population Average
# Horizontal bar chart for the high-population countries; coord_flip() puts
# the countries on the vertical axis.
hp_countries_population %>%
  ggplot(aes(x = Country.Name, y = average_population)) +
  geom_col() +
  labs(
    x = "Country",
    y = "Average Population",
    title = "Average high-population countries"
  ) +
  coord_flip()

# Horizontal bar chart for the low-population countries
lp_countries_population %>%
  ggplot(aes(x = Country.Name, y = average_population)) +
  geom_col() +
  labs(
    x = "Country",
    y = "Average Population",
    title = "Average low-population countries"
  ) +
  coord_flip()

Summary